Data Inspection¶

In [49]:
import plotly
plotly.offline.init_notebook_mode()
In [50]:
import pandas as pd
from tqdm import tqdm

df_list = list()
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv", 
    chunksize=100000, 
    dtype = {"CANCELLATION_CODE": str}
)
for chunk in tqdm(chunk_iter):
    df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:28,  4.52it/s]
Out[50]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... TAXI_IN ARR_TIME ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 28.0 1214.0 9.0 0.0 NaN NaN NaN NaN NaN NaN
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 13.0 945.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... 6.0 1341.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 7.0 1159.0 4.0 0.0 NaN NaN NaN NaN NaN NaN
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 6.0 2317.0 2.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 24 columns

In [51]:
df.describe()
Out[51]:
ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEP_TIME DEP_DELAY TAXI_OUT TAXI_IN ARR_TIME ARR_DELAY CANCELLED CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
count 1.292656e+07 1.292656e+07 1.292656e+07 1.292656e+07 1.271494e+07 1.271486e+07 1.271048e+07 1.270744e+07 1.270744e+07 1.268139e+07 1.292656e+07 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06
mean 1.298824e+06 3.161338e+04 1.298812e+06 3.161370e+04 1.332409e+03 1.127152e+01 1.722694e+01 8.678440e+00 1.471801e+03 5.409690e+00 1.675551e-02 2.041558e+01 2.650336e+00 1.520350e+01 1.268451e-01 2.413592e+01
std 1.453419e+05 1.168922e+03 1.453231e+05 1.168763e+03 5.192380e+02 4.514114e+01 9.141093e+00 6.704624e+00 5.554134e+02 4.738376e+01 1.283540e-01 5.935956e+01 2.208897e+01 3.191662e+01 3.210375e+00 5.163883e+01
min 1.039705e+06 3.019400e+04 1.039705e+06 3.019400e+04 1.000000e+00 -2.340000e+02 0.000000e+00 1.000000e+00 1.000000e+00 -2.380000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 1.129806e+06 3.046600e+04 1.129806e+06 3.046600e+04 9.060000e+02 -5.000000e+00 1.200000e+01 5.000000e+00 1.052000e+03 -1.400000e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 1.289208e+06 3.145400e+04 1.289208e+06 3.145400e+04 1.324000e+03 -1.000000e+00 1.500000e+01 7.000000e+00 1.515000e+03 -5.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00 2.000000e+00 0.000000e+00 0.000000e+00
75% 1.410702e+06 3.245700e+04 1.410702e+06 3.245700e+04 1.751000e+03 9.000000e+00 2.000000e+01 1.000000e+01 1.930000e+03 9.000000e+00 0.000000e+00 2.000000e+01 0.000000e+00 1.900000e+01 0.000000e+00 2.800000e+01
max 1.501606e+06 3.481900e+04 1.501606e+06 3.481900e+04 2.400000e+03 3.695000e+03 2.270000e+02 1.419000e+03 2.400000e+03 3.680000e+03 1.000000e+00 3.359000e+03 2.692000e+03 1.511000e+03 9.870000e+02 3.581000e+03
In [52]:
df.FL_DATE.max()
Out[52]:
'2024-06-30'
In [53]:
df.nunique()
Out[53]:
FL_DATE                  3653
OP_UNIQUE_CARRIER          20
TAIL_NUM                 9140
ORIGIN_AIRPORT_SEQ_ID      58
ORIGIN_CITY_MARKET_ID      20
ORIGIN                     24
ORIGIN_CITY_NAME           24
DEST_AIRPORT_SEQ_ID        58
DEST_CITY_MARKET_ID        20
DEST                       24
DEST_CITY_NAME             24
DEP_TIME                 1440
DEP_DELAY                1783
TAXI_OUT                  195
TAXI_IN                   247
ARR_TIME                 1440
ARR_DELAY                1806
CANCELLED                   2
CANCELLATION_CODE           4
CARRIER_DELAY            1558
WEATHER_DELAY             971
NAS_DELAY                 855
SECURITY_DELAY            268
LATE_AIRCRAFT_DELAY      1228
dtype: int64
In [54]:
df.dtypes
Out[54]:
FL_DATE                   object
OP_UNIQUE_CARRIER         object
TAIL_NUM                  object
ORIGIN_AIRPORT_SEQ_ID      int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN                    object
ORIGIN_CITY_NAME          object
DEST_AIRPORT_SEQ_ID        int64
DEST_CITY_MARKET_ID        int64
DEST                      object
DEST_CITY_NAME            object
DEP_TIME                 float64
DEP_DELAY                float64
TAXI_OUT                 float64
TAXI_IN                  float64
ARR_TIME                 float64
ARR_DELAY                float64
CANCELLED                float64
CANCELLATION_CODE         object
CARRIER_DELAY            float64
WEATHER_DELAY            float64
NAS_DELAY                float64
SECURITY_DELAY           float64
LATE_AIRCRAFT_DELAY      float64
dtype: object

Data Cleaning¶

In [55]:
MEDIUM_AIRPORT_CODE = [
    "DAL",
    "PDX",
    "STL",
    "RDU",
    "HOU",
    "SMF",
    "MSY",
    "SJC",
    "SJU",
    "SNA"
]
In [56]:
import matplotlib.pyplot as plt
target_df = df.drop(columns = [
    "OP_UNIQUE_CARRIER",
    "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
del df
"""
dest_df = df.drop(columns = [
    "OP_UNIQUE_CARRIER",
    "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
"""
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
target_df.isnull().mean()
Out[56]:
FL_DATE                  0.000000
ORIGIN_CITY_MARKET_ID    0.000000
ORIGIN                   0.000000
ORIGIN_CITY_NAME         0.000000
DEST_CITY_MARKET_ID      0.000000
DEST                     0.000000
DEST_CITY_NAME           0.000000
DEP_DELAY                0.016377
ARR_DELAY                0.018966
CANCELLED                0.000000
CANCELLATION_CODE        0.983244
CARRIER_DELAY            0.801989
WEATHER_DELAY            0.801989
NAS_DELAY                0.801989
SECURITY_DELAY           0.801989
LATE_AIRCRAFT_DELAY      0.801989
dtype: float64
In [57]:
airport_set_df = target_df.ORIGIN.drop_duplicates()
airport_review = pd.read_csv("../supplementary/Airport-Reviews-Table.csv")
airport_review = airport_review[airport_review.AIRPORT_CODE.isin(airport_set_df)]

airport_default_score = airport_review.groupby("AIRPORT_CODE")[[
    'ratingValue',
    'queueTime', 
    'terminalCleanliness', 
    'terminalSeating', 
    'terminalSign', 
    'foodBeverage', 
    'airportShopping', 
    'wifiConnectivity', 
    'airportStaff', 
    'sentiment'
]].mean().reset_index()

airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE")
airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)
airport_review_merged.head()
Out[57]:
AIRPORT_CODE reviewCount ratingValue queueTime terminalCleanliness terminalSeating terminalSign foodBeverage airportShopping wifiConnectivity airportStaff sentiment Large
0 ATL 333 0.256456 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
1 CLT 169 0.242012 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
2 DAL 26 0.450000 3.173913 3.695652 3.150000 3.478261 3.304348 3.090909 3.714286 2.875000 -0.153846 False
3 DEN 151 0.260265 1.847682 2.536424 2.253731 2.213333 2.006757 2.308511 2.557895 1.934307 -0.761589 True
4 DFW 175 0.321143 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
In [58]:
target_airport_df = pd.merge(
    target_df, 
    airport_review_merged,
    how = "inner", 
    left_on = "ORIGIN", 
    right_on = "AIRPORT_CODE",
    suffixes = ["", "_origin"]
)
target_airport_df = pd.merge(
    target_airport_df, 
    airport_review_merged,
    how = "inner", 
    left_on = "DEST", 
    right_on = "AIRPORT_CODE",
    suffixes = ["", "_dest"]
)
target_airport_df.rename(columns = {
    'reviewCount':'reviewCount_origin', 
    'ratingValue':'ratingValue_origin', 
    'queueTime':'queueTime_origin',
    'terminalCleanliness':'terminalCleanliness_origin', 
    'terminalSeating':'terminalSeating_origin', 
    'terminalSign':'terminalSign_origin',
    'foodBeverage':'foodBeverage_origin', 
    'airportShopping':'airportShopping_origin', 
    'wifiConnectivity':'wifiConnectivity_origin', 
    'airportStaff':'airportStaff_origin',
    'sentiment':'sentiment_origin', 
    'Large':'Large_origin'
}, inplace = True)
target_airport_df.drop(columns = ['AIRPORT_CODE', 'AIRPORT_CODE_dest'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[58]:
FL_DATE ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_CITY_MARKET_ID DEST DEST_CITY_NAME DEP_DELAY ARR_DELAY CANCELLED ... queueTime_dest terminalCleanliness_dest terminalSeating_dest terminalSign_dest foodBeverage_dest airportShopping_dest wifiConnectivity_dest airportStaff_dest sentiment_dest Large_dest
0 2014-07-01 31057 CLT Charlotte, NC 30194 DFW Dallas/Fort Worth, TX -5.0 9.0 0.0 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
1 2014-07-01 30194 DFW Dallas/Fort Worth, TX 31057 CLT Charlotte, NC -4.0 0.0 0.0 ... 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
2 2014-07-01 30397 ATL Atlanta, GA 30194 DFW Dallas/Fort Worth, TX -2.0 -9.0 0.0 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
3 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 14.0 4.0 0.0 ... 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
4 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 1.0 2.0 0.0 ... 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True

5 rows × 40 columns

Visualization¶

Airport Review¶

First of all, let's have a look at the Review Data we collected

In [59]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = airport_review_merged.loc[:, 'reviewCount':].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask = mask)
plt.title('Airport Review Score Correlation')
plt.show()
No description has been provided for this image

Summary

The majority of the score given by users are very correlated,
with the exception of reviewCount of an airport, which inversely correlate with all other attributes.

Airport Ranking¶

In [60]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots 
In [61]:
fig = go.Figure()

fig = px.scatter(
    data_frame = airport_review_merged,
    x='sentiment', 
    y='ratingValue',
    color='ratingValue',
    size = 'reviewCount',
    color_continuous_scale = 'viridis',
    facet_col = "Large",
    text='AIRPORT_CODE',
    log_y = True
)
fig.update_traces(textposition='top center')
fig.for_each_annotation(
    lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
    height=600, 
    width=1000, 
    title_text="Airport Sentiment Visualization", 
    showlegend=False
)
fig.show()

Review Performance per Airport¶

In [62]:
df_columns = [
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_columns)
col = [1, 2]*4
row = [1, 1, 2, 2, 3, 3, 4, 4]
for c, r, column in zip(col, row, df_columns):
    airport_review_merged.sort_values(column, ascending = False, inplace = True)
    trace = go.Bar(
        x=airport_review_merged['AIRPORT_CODE'], 
        y=airport_review_merged[column],
        marker=dict(
            color = airport_review_merged[column],
            colorscale='viridis'
        )
    )
    fig.add_trace(
        trace, 
        row=r, 
        col=c
    )
fig.update_layout(
    height=1000, 
    title_text="Individual Scoring Attributes of Airports",
    showlegend=False
)
fig.show()

Delay vs Ranking¶

In [63]:
train_columns = [
    'queueTime_origin',
    'terminalCleanliness_origin',
    'terminalSeating_origin',
    'terminalSign_origin',
    'foodBeverage_origin',
    'airportShopping_origin',
    'wifiConnectivity_origin',
    'airportStaff_origin',
    'Large_origin',
    'queueTime_dest',
    'terminalCleanliness_dest',
    'terminalSeating_dest',
    'terminalSign_dest',
    'foodBeverage_dest',
    'airportShopping_dest',
    'wifiConnectivity_dest',
    'airportStaff_dest',
    'Large_dest'
]
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]

Delay Data Size >0

In [64]:
cal_df = target_airport_df[['ORIGIN'] + label_columns]
cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 0).mean()).reset_index()
Out[64]:
ORIGIN DEP_DELAY ARR_DELAY CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 ATL 0.391977 0.346680 0.102059 0.012486 0.098133 0.000559 0.070856
1 CLT 0.386671 0.377364 0.117649 0.012658 0.119450 0.002985 0.094161
2 DAL 0.511887 0.405451 0.138866 0.010460 0.092957 0.001003 0.135679
3 DEN 0.442719 0.386610 0.127630 0.010701 0.112139 0.000455 0.110875
4 DFW 0.417090 0.400279 0.137167 0.018804 0.118154 0.000864 0.105878
5 EWR 0.401861 0.362531 0.133538 0.011004 0.122859 0.000979 0.107049
6 HOU 0.461493 0.377129 0.120927 0.009179 0.084441 0.000970 0.115730
7 JFK 0.357083 0.337509 0.119247 0.010789 0.129858 0.001389 0.068300
8 LAS 0.400518 0.375535 0.111114 0.005143 0.108687 0.000699 0.113227
9 LAX 0.384964 0.363739 0.104485 0.004377 0.102161 0.000770 0.096527
10 MCO 0.390880 0.372180 0.110292 0.020802 0.120200 0.001602 0.106974
11 MIA 0.405724 0.393572 0.136413 0.019572 0.126643 0.001830 0.099147
12 MSY 0.360981 0.338699 0.087303 0.010217 0.093822 0.000673 0.106858
13 ORD 0.405576 0.382400 0.121694 0.021180 0.129221 0.000579 0.098953
14 PDX 0.298812 0.321973 0.066115 0.004676 0.084625 0.000397 0.068284
15 PHX 0.392451 0.362283 0.101458 0.003507 0.090475 0.001300 0.098925
16 RDU 0.344065 0.332790 0.086812 0.008671 0.104936 0.000614 0.091356
17 SEA 0.351422 0.362239 0.082688 0.006774 0.105163 0.000757 0.068566
18 SFO 0.369786 0.367693 0.098870 0.004128 0.108372 0.000497 0.107824
19 SJC 0.340300 0.339721 0.079905 0.003123 0.076541 0.000883 0.098579
20 SJU 0.347668 0.355347 0.117447 0.002182 0.102229 0.001834 0.121220
21 SMF 0.358445 0.343033 0.077687 0.003122 0.080243 0.000597 0.100190
22 SNA 0.352813 0.332351 0.068681 0.004160 0.083964 0.000328 0.098166
23 STL 0.390224 0.347997 0.098000 0.008861 0.093257 0.000460 0.101216
In [65]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()

plt.figure(figsize=(10, 5))
label_columns.remove('NAS_DELAY')
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays (Without NAS)')
plt.show()
No description has been provided for this image
No description has been provided for this image

Departure Delay Per Airport¶

In [66]:
for airport_direction in ["ORIGIN", "DEST"]:
    airport_temp = target_airport_df.sample(1000000, replace = False).groupby(airport_direction)['DEP_DELAY'].quantile(0.75).reset_index()
    airport_temp_merged = pd.merge(
        target_airport_df.sample(100000, replace = False), 
        airport_temp, how = "inner", 
        on = airport_direction, 
        suffixes = ["", "_mean"]
    )
    airport_temp_merged['DEP_DELAY'] = airport_temp_merged['DEP_DELAY'].apply(lambda x: 0 if x < 0 else x)
    fig = px.box(
        airport_temp_merged.sort_values([
            f"Large_{airport_direction.lower()}", 
            "DEP_DELAY_mean"
        ], ascending = False), 
        x=airport_direction, 
        y="DEP_DELAY",
        log_y = True,
        color = f"Large_{airport_direction.lower()}",
        color_discrete_map = {True: 'red', False: 'blue'}
    )
    fig.update_layout(
        height=400, 
        title_text=f"{airport_direction} Aiport Departure Delay",
        showlegend=True
    )
    fig.show()

Cancelled¶

In [67]:
train_columns = [
    'queueTime_origin',
    'terminalCleanliness_origin',
    'terminalSeating_origin',
    'terminalSign_origin',
    'foodBeverage_origin',
    'airportShopping_origin',
    'wifiConnectivity_origin',
    'airportStaff_origin',
    'Large_origin',
    'queueTime_dest',
    'terminalCleanliness_dest',
    'terminalSeating_dest',
    'terminalSign_dest',
    'foodBeverage_dest',
    'airportShopping_dest',
    'wifiConnectivity_dest',
    'airportStaff_dest',
    'Large_dest'
]
label_columns = [
    "Cancellation_A",
    "Cancellation_B",
    "Cancellation_C",
    "Cancellation_D",
    "Cancellation_Not"
]
In [69]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

temp = pd.concat([
    target_airport_df, 
    pd.get_dummies(target_airport_df['CANCELLATION_CODE'], prefix = "Cancellation")
], axis = 1)

corr_matrix = temp.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()
No description has been provided for this image
In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import math
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

def preprocess(df, y_columns='ARR_DELAY'):
    X = df.loc[:, train_columns]
    y = df.loc[:, y_columns]
    return X, y
In [ ]:
train, test = train_test_split(target_airport_df, test_size=0.2)
In [72]:
train_X, train_y = preprocess(train, "CANCELLATION_CODE")
test_X, test_y = preprocess(test, "CANCELLATION_CODE")

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)
In [73]:
model = LogisticRegression(
    multi_class='ovr', 
    max_iter=1000,
    class_weight="balanced"
)
model.fit(train_X_scaled, train_y)
C:\Users\wongh\anaconda3\envs\601\lib\site-packages\sklearn\linear_model\_logistic.py:1256: FutureWarning:

'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.

Out[73]:
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
In [74]:
import seaborn as sn
import matplotlib.pyplot as plt

train_predictions = model.predict(train_X_scaled)
test_predictions = model.predict(test_X_scaled)

train_conf_matrix = confusion_matrix(train_y, train_predictions)
test_conf_matrix = confusion_matrix(test_y, test_predictions)

sn.heatmap(train_conf_matrix, annot=True, fmt='g')
plt.show()

sn.heatmap(test_conf_matrix, annot=True, fmt='g')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [76]:
train_y_encoded = label_binarize(train_y, classes=model.classes_)
test_y_encoded = label_binarize(test_y, classes=model.classes_)

train_probs = model.predict_proba(train_X_scaled)
test_probs = model.predict_proba(test_X_scaled)

fpr_train, tpr_train, _ = roc_curve(train_y_encoded.ravel(), train_probs.ravel())
fpr_test, tpr_test, _ = roc_curve(test_y_encoded.ravel(), test_probs.ravel())

train_auc = roc_auc_score(train_y_encoded, train_probs, average='micro')
test_auc = roc_auc_score(test_y_encoded, test_probs, average='micro')
In [77]:
def roc(fpr, tpr, auc):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(
        fpr, 
        tpr, 
        label=f'Train ROC curve (area = {auc:.2})'
    )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Train ROC Curve')
    plt.legend(loc="lower right")

roc(fpr_train, tpr_train, train_auc)
roc(fpr_test, tpr_test, test_auc)
No description has been provided for this image
No description has been provided for this image
In [78]:
weight = model.coef_

fig, axs = plt.subplots(5, 1, figsize=(10, 20))

for i in range(5):
    ax = axs[i]
    ax.barh(train_columns, weight[i])
    ax.set_title(f'Bar Chart {label_columns[i]}')

plt.tight_layout()
plt.show()
No description has been provided for this image

from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn import preprocessing from tqdm import tqdm import numpy as np import math

def preprocess(df, y_columns = 'ARR_DELAY'): df.dropna(subset = y_columns, inplace = True) X = df.loc[:, train_columns] y = df.loc[:, y_columns] return X, y

def experiment(df, y_columns = 'ARR_DELAY'): train, test = train_test_split(df, test_size=0.2)

train_X, train_y = preprocess(train, y_columns)
test_X, test_y = preprocess(test, y_columns)

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)

model = LinearRegression()
model.fit(train_X_scaled, train_y)

train_r_score = model.score(train_X_scaled, train_y)
test_r_score = model.score(test_X_scaled, test_y)

return train_r_score, test_r_score, model.coef_

Redacted¶

from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn import preprocessing from tqdm import tqdm import numpy as np import math

def preprocess(df, y_columns = 'ARR_DELAY'): df.dropna(subset = y_columns, inplace = True) X = df.loc[:, train_columns] y = df.loc[:, y_columns] return X, y

def experiment(df, y_columns = 'ARR_DELAY'): train, test = train_test_split(df, test_size=0.2)

train_X, train_y = preprocess(train, y_columns)
test_X, test_y = preprocess(test, y_columns)

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)

model = LinearRegression()
model.fit(train_X_scaled, train_y)

train_r_score = model.score(train_X_scaled, train_y)
test_r_score = model.score(test_X_scaled, test_y)

return train_r_score, test_r_score, model.coef_

Arrival Delay Reasoning¶

train_r_list, test_r_list, coef_list = list(), list(), list() for i in tqdm(range(EXPERIMENT_COUNT)): np.random.seed(i) train_r_score, test_r_score, coef = experiment(target_airport_df, 'LOG_ARR_DELAY') train_r_list.append(train_r_score) test_r_list.append(test_r_score) coef_list.append(coef)

import plotly.express as px

coef_df_list = list()

for i, array in enumerate(coef_list): coef_df = pd.DataFrame({ "Feature": train_columns, "Weight": array }) coef_df_list.append(coef_df)

result_df = pd.concat(coef_df_list)

fig = px.box( result_df, x="Feature", y="Weight", ) fig.update_layout( height=600, title_text="Aiport Rating vs Arrival Delay", showlegend=False ) fig.show()

In [19]:
train_r_list, test_r_list
Out[19]:
([0.0027769056925236457,
  0.002726794565260926,
  0.002739984286952546,
  0.0027382477593851373,
  0.002750673837592532,
  0.0027616015050112575,
  0.002722193250161431,
  0.002763664872232119,
  0.0027681168845542503,
  0.002773670861719535],
 [0.0026226756634710746,
  0.0028214286965265734,
  0.0027696415080686876,
  0.002776677393967586,
  0.0027283399081202697,
  0.0026838710483376715,
  0.0028416825308008598,
  0.0026738151940109978,
  0.00265653353537032,
  0.002636301651983919])

Departure Delay Reasoning¶

train_r_list, test_r_list, coef_list = list(), list(), list() for i in tqdm(range(EXPERIMENT_COUNT)): np.random.seed(i) train_r_score, test_r_score, coef = experiment(target_airport_df, 'DEP_DELAY') train_r_list.append(train_r_score) test_r_list.append(test_r_score) coef_list.append(coef)

import plotly.express as px

coef_df_list = list()

for i, array in enumerate(coef_list): coef_df = pd.DataFrame({ "Feature": train_columns, "Weight": array }) coef_df_list.append(coef_df)

result_df = pd.concat(coef_df_list)

fig = px.box( result_df, x="Feature", y="Weight", ) fig.update_layout( height=600, title_text="Aiport Rating vs Departure Delay", showlegend=False ) fig.show()

In [ ]:
 

import pandas as pd import plotly.graph_objects as go import plotly.express as px

fig = go.Figure()

coef_df.sort_values("Weight", ascending = False, inplace = True) fig = px.bar( data_frame = coef_df, x='Feature', y='Weight', color='Weight', color_continuous_scale = 'viridis', ) fig.update_layout( height=600, width=1000, title_text="Airport Sentiment Visualization", showlegend=False ) fig.show()

import math

target_airport_df["LOG_ARR_DELAY"] = target_airport_df["ARR_DELAY"].apply(lambda x: 0 if x <= 0 else math.log(x)) target_airport_df["LOG_DEP_DELAY"] = target_airport_df["DEP_DELAY"].apply(lambda x: 0 if x <= 0 else math.log(x)) target_airport_df["LOG_ARR_DELAY"]

In [ ]: